---
title: "Figure 1 and 2"
lang: en
format:
  html:
    echo: false
    toc: true
    toc-depth: 2
    toc-location: left
    embed-resources: true
---

```{r setup, include = FALSE}
library(dplyr)
library(here)
library(purrr)
library(broom)
library(magrittr)
library(ggplot2)
library(cowplot)
library(ggrepel)
library(stargazer)
```


```{r}
# Import data
data <- haven::read_dta(here::here("1_Data", "master_data.dta"))
```


## Fig 1: Survey results 1

### Panel A
```{r rows.print = 15}
# Prep and show data
(data_panel_a <- data %>% 
   filter(!is.na(tails_w1)) %>% 
   group_by(tails_w1) %>% 
   tally() %>%
   mutate(share = round(n / sum(n), 3),
          exp_share = dbinom(tails_w1, size = 10, prob = 0.5))
)

# Create Figure
(fig_a <- ggplot(data_panel_a,
                 aes(x = as.factor(tails_w1),
                     y = share)) +
   geom_bar(stat = "identity",
            fill = "#377eb8") + # Color from https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
   geom_point(aes(x = tails_w1 + 1, y = exp_share), color = "darkgray") +
   geom_line(aes(x = tails_w1 + 1, y = exp_share), color = "darkgray", linetype = "dashed") +
   scale_y_continuous(limits = c(0, 0.35),
                      expand = expansion(0, 0),
                      labels = scales::percent_format()) +
   labs(x = "Reported number of tails",
        y = "Respondents") +
   theme(panel.background = element_blank(),
         axis.line = element_line(color = "black"),
         axis.ticks.x = element_blank())
)
```


### Panel B
```{r}
# Obtain the likelihood for each reoorted tails 
truthtelling_probs <- tibble(
  tails = 0:10,
  prob_binom = dbinom(0:10,10,0.5))

# Merge and prep data
data_panelc <- data %>%
  filter(!is.na(tails_w1)) %>%
  group_by(tails_w1) %>% 
  summarize(covid_positive = mean(ever_positive, na.rm = TRUE)) %>%
  left_join(y = truthtelling_probs,
            by = c("tails_w1" = "tails"))

# Create Figure
(fig_b <- data_panelc %>% 
    ggplot(aes(x = prob_binom,
               y = covid_positive)) +
    geom_smooth(method = lm,
                se = TRUE,
                color = "#e41a1c",
                fill = "lightgray") +
    geom_point(color = "black") +
    scale_y_continuous(
                       breaks = seq(0, 0.15, 0.05),
                       labels = scales::percent_format(accuracy = 1)) +
    scale_x_continuous(
                       breaks = seq(0, 0.25, 0.05),
                       labels = scales::percent_format(accuracy = 1)) +
    # coord_fixed(ylim = c(0,0.15),
    #             xlim = c(0, 0.26)) +
    labs(x = "Likelihood of rep. num. of tails",
         y = "COVID-19 infection") +
    theme(panel.background = element_blank(),
          axis.line = element_line(color = "black"),
          axis.ticks.x = element_blank(),
          #aspect.ratio = 0.4
          )
)
```


### Panel C
```{r}
# Show mean infections
data %>% 
  group_by(ten_tails_w1) %>% 
  summarize(n = n(),
            mean_ever_positive = round(mean(ever_positive, na.rm = TRUE), 4))
# ten_tails_w1 n mean_ever_positive
# 0	2612	0.0505
# 1	111	0.1261

 # Create Figure
(fig_c <- ggplot(data %>% filter(!is.na(ten_tails_w1)),
       aes(x = factor(ten_tails_w1, labels = c("tails < 10", "tails = 10")),
           y = ever_positive)) +
  stat_summary(fun = mean,
               geom = "bar",
               fill = "#377eb8") +
  stat_summary(fun.data = mean_se,
               geom = "errorbar",
               width = 0.1) +
  scale_y_continuous(breaks = seq(0, 0.15, 0.05),
                     expand = expansion(0,0),
                     labels = scales::percent_format(accuracy = 1)) +
  coord_cartesian(ylim = c(0, 0.16)) +
  labs(x = "",
       y = "COVID-19 infection") +
  theme(panel.background = element_blank(),
          axis.line = element_line(color = "black"),
          axis.ticks.x = element_blank(),
        axis.text.x = element_text(size = 11, color = "black")) # Like axis.title.x
)
```


### Panel D
```{r}
data_filtered <- data[complete.cases(data[c("will_get_tested_w1",
                                            "change_contacts_next_w1",
                                            "change_cleaning_hands_next_w1",
                                            "perception_gov_reg_w1",
                                            "change_con_wrt_gov_reg_w1",
                                            "patience_w1",
                                            "neg_reciprocity_w1",
                                            "altruism_w1")]), ]



# Standardize variables and calculate their z-scores
z_vars <- c("will_get_tested_w1",
            "change_contacts_next_w1",
            "change_cleaning_hands_next_w1",
            "perception_gov_reg_w1",
            "change_con_wrt_gov_reg_w1",
            "patience_w1",
            "neg_reciprocity_w1",
            "altruism_w1")
   

 
filter_complete_cases <- function(data, vars) {
  # Filter data to include only complete cases for specified variables
  filtered_data <- data[complete.cases(data[, vars]), ]

  return(filtered_data)
}
 
df_complete <- filter_complete_cases(data, c("will_get_tested_w1",
                                             "change_contacts_next_w1",
                                             "change_cleaning_hands_next_w1",
                                             "perception_gov_reg_w1",
                                             "change_con_wrt_gov_reg_w1",
                                             "patience_w1",
                                             "neg_reciprocity_w1",
                                             "altruism_w1") 
)


            
# Define function to calculate the z-score
calc_z_score <- function(x) {
  z_score <- (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
  return(z_score)
}

df_complete %<>%
  mutate(across(all_of(z_vars), ~ calc_z_score(x = .)))
```


```{r}
# Run regression models over the z-scores
regression_models <- map(z_vars, ~ lm(as.formula(paste(.x, "~ ten_tails_w1")), data = df_complete))

# Extract coefficients from each model
coefficients_list <- map(regression_models, broom::tidy)

# Combine the coefficients into a single dataframe
coefficients_df <- bind_rows(coefficients_list)
coefficients_df %<>% 
  filter(term == "ten_tails_w1")

# Add the names of the outcome variables
# coefficients_df$out_var <- z_vars
coefficients_df$out_var <- c(
  "Willingness to get tested",
	"Change in contacts (next week)",
	"Change in hand cleaning effort (next week)",
	"Perception towards gov. regulation",
	"Change in contacts relative to gov. reg.",
	"Patience",
	"Negative reciprocity",
	 "Altruism"
)

# Add group names for the outcome variables
coefficients_df$group_var <- c(
  "Reported behavior",
	"Reported behavior",
	"Reported behavior",
	"Reported behavior",
	"Reported behavior",
	"Personality",
	"Personality",
	"Personality"
)

# Make sure the order stays the same when using factor variables
coefficients_df %<>% mutate(id = row_number())
coefficients_df$out_var %<>% as.factor()
coefficients_df$group_var %<>% as.factor()
coefficients_df$group_var <- factor(coefficients_df$group_var, levels = c("Reported behavior", "Personality"))
levels(coefficients_df$group_var)

# Check table
coefficients_df %>% 
  mutate(estimate = round(estimate, 2),
         p.value = round(p.value, 3))
```


```{r}
# Plot regression results
(fig_d <- ggplot(coefficients_df, aes(y = forcats::fct_reorder(out_var, -id), x = estimate)) +
  geom_vline(xintercept = 0, color = "#c13405") +
  geom_point(color = "#1a476f") +
  geom_linerange(aes(xmin = estimate - std.error,
                     xmax = estimate + std.error),
                 color = "#1a476f") +
  labs(x = "Differences in z-score\n (10 tails vs. <10 tails)",
       y = "") + 
  scale_x_continuous(limits = c(-1, 1),
                     breaks = c(-1, 0, 1)) +
  theme(panel.background = element_blank(),
        axis.line = element_line(color = "black"),
        panel.grid.major.y = element_line(color = "#EAF2F3"),
        panel.grid.major.x = element_blank()) +
  facet_grid(group_var ~ .,
             switch = "x", # Place facet labels to right side (for the left side, use "y")
             scales = "free",
             space = "free")
)
```



### Combine all figures
```{r}
(plot <- ggdraw() +
  draw_plot(fig_a, x = 0, y = 0.55, width = 0.4, height = 0.45) +
  draw_plot(fig_b, x = 0.42, y = 0.55, width = 0.6, height = 0.43) +
  draw_plot(fig_c, x = 0, y = 0, width = 0.4, height = 0.55) +
  draw_plot(fig_d, x = 0.4, y = 0, width = 0.6, height = 0.55) +
  draw_plot_label(c("A", "C", "B", "D"),
                  x = c(0, 0.4, 0, 0.4),
                  y = c(1, 1, 0.55, 0.55),
                  size = 15))

# Save plot
ggsave(plot,
       filename = "Fig_1.pdf",
       path = here::here("3_Results"),
       width = 7.5, height = 4.5)
```




## Fig 2: Cross-country analysis

First, we obtain the cumulative number of excess deaths on December 2021. This data was generated and published by the WHO: Msemburi, W., Karlinsky, A., Knutson, V. et al. The WHO estimates of excess mortality associated with the COVID-19 pandemic. Nature 613, 130–137 (2023). https://doi.org/10.1038/s41586-022-05522-2).

```{r}
who_data_file <- here::here("1_Data", "WHO_excess_deaths.xlsx")

# Download WHO data if it does not yet exists
if (!file.exists(who_data_file)) {
  download.file("https://github.com/WHOexcessc19/Codebase/raw/main/Generated_Data/Estimate.Database.xlsx",
                destfile = who_data_file)
}

# Import WHO data
country_deaths <- readxl::read_excel(who_data_file, skip = 26)

# Rename some countries
country_deaths$location[country_deaths$location == "The United Kingdom"] <- "United Kingdom"
country_deaths$location[country_deaths$location == "United States of America"] <- "United States"
country_deaths$location[country_deaths$location == "Russian Federation"] <- "Russia"
country_deaths$location[country_deaths$location == "Viet Nam"] <- "Vietnam"

# Use mean cumulative excess deaths until end of 2021:
country_deaths %<>%
  filter(month == 12,
         year == 2021) %>% 
  select(location, iso3, c.excess.mean)

# Show list of countries
country_deaths %>%
  arrange(c.excess.mean)
```


# Next, we obtain the population size from [ourworldindata](https://ourworldindata.org/grapher/population?tab=table)
```{r}
# Import data
population_data <- read.csv(here::here("1_Data", "1_raw", "population.csv"))

# Filter for 2021 and rename variables
population_data %<>% 
  filter(Year == 2021) %>% 
  rename(country = Entity,
         iso3 = Code,
         population = Population..historical.estimates.)

# Show table
# population_data
```


# Finally, we obtain the honesty measure by Cohn et al.
```{r}

# Import data from cohn's wallets
cohn_wallets <- haven::read_dta(here::here("1_Data", "1_raw", "behavioral data (stata data file).dta"))

# Rename some countries
cohn_wallets$Country[cohn_wallets$Country == "UK"] <- "United Kingdom"
cohn_wallets$Country[cohn_wallets$Country == "UAE"] <- "United Arab Emirates"
cohn_wallets$Country[cohn_wallets$Country == "USA"] <- "United States"
cohn_wallets$Country[cohn_wallets$Country == "Czech Republic"] <- "Czechia"

# We keep condition 1
table(cohn_wallets$cond)

cohn_wallets %<>%
  filter(cond == 1)

# Key outcome variable is "response", we make it binary to (0,1) instead of (0, 100)
hist(cohn_wallets$response)
cohn_wallets$response[cohn_wallets$response == 100] <- 1

# Collapse to country level
country_wallets <- cohn_wallets %>% 
  group_by(Country) %>% 
  summarize(mean_response = mean(response, na.rm = TRUE))

# Show country list
country_wallets
hist(country_wallets$mean_response)
```


# We then merge all datasets and introduce a binary dummy variable for OECD countries.
```{r}
# Merge WHO and Cohn datasets
data_cross_countries <- left_join(x = country_deaths,
                                  y = country_wallets,
                                  by = c("location" = "Country"))

# Add population data
data_cross_countries %<>% left_join(y = population_data,
                                    by = "iso3")

# We calculate the excess rate = cumul.excess.mean / population
data_cross_countries %<>%
  mutate(excess_deaths_rate = c.excess.mean / population)

# Add dummy for OECD countries
oecd_countries <- tibble(
  oecd_member = 1,
  country_name = c("Australia", "Austria", "Belgium", "Canada", "Chile", "Colombia", "Costa Rica",
                   "Czechia", "Denmark", "Estonia", "Finland", "France", "Germany", "Greece",
                   "Hungary", "Iceland", "Ireland", "Israel", "Italy", "Japan", "Republic of Korea",
                   "Latvia", "Lithuania", "Luxembourg", "Mexico", "Netherlands", "New Zealand",
                   "Norway", "Poland", "Portugal", "Slovakia", "Slovenia", "Spain", "Sweden",
                   "Switzerland", "Turkey", "United Kingdom", "United States"))

data_cross_countries %<>% left_join(y = oecd_countries,
                                    by = c("country" = "country_name"))

# Show table
data_cross_countries

# We use the following list of countries
data_cross_countries %>% 
  filter(oecd_member == 1,
         !is.na(mean_response),
         !is.na(c.excess.mean))
```


```{r}
# Create Figure
(plot_cross_countries <- data_cross_countries %>% 
  filter(oecd_member == 1) %>% 
  ggplot(aes(x = mean_response, y = excess_deaths_rate)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "darkgray") +
  geom_smooth(method = lm,
              se = TRUE,
              color = "#e41a1c",
              fill = "lightgray") +
  geom_point() +
  geom_text_repel(aes(label = iso3), size = 2.5, box.padding = 0.15) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 0.1),
                     limits = c(-0.0011, 0.006)) +
  scale_x_continuous(labels = scales::percent_format(accuracy = 1)) +
  labs(x = "Civic honesty",
       y = "Excess death rate\n from COVID-19") +
  theme(panel.background = element_blank(),
        axis.line = element_line(color = "black"),
        axis.ticks.x = element_blank()))

# Save plot
 ggsave(plot_cross_countries,
       filename = "Fig_2.pdf",
        path = here::here("3_Results"),
       width = 5, height = 4)
```

